1 /*
2 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 /*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37 package sun.text.normalizer;
38
39 import java.io.BufferedInputStream;
40 import java.io.ByteArrayInputStream;
41 import java.io.IOException;
42 import java.io.BufferedInputStream;
43 import java.io.InputStream;
44
45 /**
46 * @author Ram Viswanadha
47 */
48 public final class NormalizerImpl {
/**
 * Singleton instance. Constructing it runs the private constructor, which
 * loads the normalization data into the static tables of this class.
 */
static final NormalizerImpl IMPL;

static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // Same message as before, but chain the original exception so the
        // root cause and its stack trace are not lost.
        throw new RuntimeException(e.getMessage(), e);
    }
}
63
64 static final int UNSIGNED_BYTE_MASK =0xFF;
65 static final long UNSIGNED_INT_MASK = 0xffffffffL;
66 /*
67 * This new implementation of the normalization code loads its data from
68 * unorm.icu, which is generated with the gennorm tool.
69 * The format of that file is described at the end of this file.
70 */
71 private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
72
73 // norm32 value constants
74
75 // quick check flags 0..3 set mean "no" for their forms
76 public static final int QC_NFC=0x11; /* no|maybe */
77 public static final int QC_NFKC=0x22; /* no|maybe */
78 public static final int QC_NFD=4; /* no */
79 public static final int QC_NFKD=8; /* no */
80
81 public static final int QC_ANY_NO=0xf;
82
83 /* quick check flags 4..5 mean "maybe" for their forms;
84 * test flags>=QC_MAYBE
85 */
86 public static final int QC_MAYBE=0x10;
87 public static final int QC_ANY_MAYBE=0x30;
88
89 public static final int QC_MASK=0x3f;
90
91 private static final int COMBINES_FWD=0x40;
92 private static final int COMBINES_BACK=0x80;
93 public static final int COMBINES_ANY=0xc0;
94 // UnicodeData.txt combining class in bits 15.
95 private static final int CC_SHIFT=8;
96 public static final int CC_MASK=0xff00;
97 // 16 bits for the index to UChars and other extra data
98 private static final int EXTRA_SHIFT=16;
99
100 /* norm32 value constants using >16 bits */
101 private static final long MIN_SPECIAL = (long)(0xfc000000 & UNSIGNED_INT_MASK);
102 private static final long SURROGATES_TOP = (long)(0xfff00000 & UNSIGNED_INT_MASK);
103 private static final long MIN_HANGUL = (long)(0xfff00000 & UNSIGNED_INT_MASK);
104 // private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
105 private static final long JAMO_V_TOP = (long)(0xfff30000 & UNSIGNED_INT_MASK);
106
107
108 /* indexes[] value names */
109 /* number of bytes in normalization trie */
110 static final int INDEX_TRIE_SIZE = 0;
111 /* number of chars in extra data */
112 static final int INDEX_CHAR_COUNT = 1;
113 /* number of uint16_t words for combining data */
114 static final int INDEX_COMBINE_DATA_COUNT = 2;
115 /* first code point with quick check NFC NO/MAYBE */
116 public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
117 /* first code point with quick check NFKC NO/MAYBE */
118 public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
119 /* first code point with quick check NFD NO/MAYBE */
120 public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
121 /* first code point with quick check NFKD NO/MAYBE */
122 public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
123 /* number of bytes in FCD trie */
124 static final int INDEX_FCD_TRIE_SIZE = 10;
125 /* number of bytes in the auxiliary trie */
126 static final int INDEX_AUX_TRIE_SIZE = 11;
127 /* changing this requires a new formatVersion */
128 static final int INDEX_TOP = 32;
129
130
131 /* AUX constants */
132 /* value constants for auxTrie */
133 private static final int AUX_UNSAFE_SHIFT = 11;
134 private static final int AUX_COMP_EX_SHIFT = 10;
135 private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
136
137 private static final int AUX_MAX_FNC = ((int)1<<AUX_COMP_EX_SHIFT);
138 private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
139 private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
140 private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
141 private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
142
143 private static final int MAX_BUFFER_SIZE = 20;
144
145 /*******************************/
146
147 /* Wrappers for Trie implementations */
148 static final class NormTrieImpl implements Trie.DataManipulate{
149 static IntTrie normTrie= null;
150 /**
151 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
152 * data the index array offset of the indexes for that lead surrogate.
153 * @param property data value for a surrogate from the trie, including
154 * the folding offset
155 * @return data offset or 0 if there is no data for the lead surrogate
156 */
157 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
158 public int getFoldingOffset(int value){
159 return BMP_INDEX_LENGTH+
160 ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
161 (0x3ff<<SURROGATE_BLOCK_BITS));
162 }
163
164 }
165 static final class FCDTrieImpl implements Trie.DataManipulate{
166 static CharTrie fcdTrie=null;
167 /**
168 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
169 * data the index array offset of the indexes for that lead surrogate.
170 * @param property data value for a surrogate from the trie, including
171 * the folding offset
172 * @return data offset or 0 if there is no data for the lead surrogate
173 */
174 /* fcdTrie: the folding offset is the lead FCD value itself */
175 public int getFoldingOffset(int value){
176 return value;
177 }
178 }
179
180 static final class AuxTrieImpl implements Trie.DataManipulate{
181 static CharTrie auxTrie = null;
182 /**
183 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
184 * data the index array offset of the indexes for that lead surrogate.
185 * @param property data value for a surrogate from the trie, including
186 * the folding offset
187 * @return data offset or 0 if there is no data for the lead surrogate
188 */
189 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
190 public int getFoldingOffset(int value){
191 return (int)(value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
192 }
193 }
194
195 /****************************************************/
196
197
198 private static FCDTrieImpl fcdTrieImpl;
199 private static NormTrieImpl normTrieImpl;
200 private static AuxTrieImpl auxTrieImpl;
201 private static int[] indexes;
202 private static char[] combiningTable;
203 private static char[] extraData;
204
205 private static boolean isDataLoaded;
206 private static boolean isFormatVersion_2_1;
207 private static boolean isFormatVersion_2_2;
208 private static byte[] unicodeVersion;
209
210 /**
211 * Default buffer size of datafile
212 */
213 private static final int DATA_BUFFER_SIZE = 25000;
214
215 /**
216 * FCD check: everything below this code point is known to have a 0
217 * lead combining class
218 */
219 public static final int MIN_WITH_LEAD_CC=0x300;
220
221
222 /**
223 * Bit 7 of the length byte for a decomposition string in extra data is
224 * a flag indicating whether the decomposition string is
225 * preceded by a 16-bit word with the leading and trailing cc
226 * of the decomposition (like for A-umlaut);
227 * if not, then both cc's are zero (like for compatibility ideographs).
228 */
229 private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
230 /**
231 * Bits 6..0 of the length byte contain the actual length.
232 */
233 private static final int DECOMP_LENGTH_MASK=0x7f;
234
235 /** Length of the BMP portion of the index (stage 1) array. */
236 private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
237 /** Number of bits of a trail surrogate that are used in index table
238 * lookups.
239 */
240 private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
241
242
243 // public utility
244 public static int getFromIndexesArr(int index){
245 return indexes[index];
246 }
247
248 // protected constructor ---------------------------------------------
249
250 /**
251 * Constructor
252 * @exception thrown when data reading fails or data corrupted
253 */
254 private NormalizerImpl() throws IOException {
255 //data should be loaded only once
256 if(!isDataLoaded){
257
258 // jar access
259 InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
260 BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
261 NormalizerDataReader reader = new NormalizerDataReader(b);
262
263 // read the indexes
264 indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
265
266 byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
267
268 int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
269 combiningTable = new char[combiningTableTop];
270
271 int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
272 extraData = new char[extraDataTop];
273
274 byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
275 byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
276
277 fcdTrieImpl = new FCDTrieImpl();
278 normTrieImpl = new NormTrieImpl();
279 auxTrieImpl = new AuxTrieImpl();
280
281 // load the rest of the data data and initialize the data members
282 reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable);
283
284 NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
285 FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl );
286 AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
287
288 // we reached here without any exceptions so the data is fully
289 // loaded set the variable to true
290 isDataLoaded = true;
291
292 // get the data format version
293 byte[] formatVersion = reader.getDataFormatVersion();
294
295 isFormatVersion_2_1 =( formatVersion[0]>2
296 ||
297 (formatVersion[0]==2 && formatVersion[1]>=1)
298 );
299 isFormatVersion_2_2 =( formatVersion[0]>2
300 ||
301 (formatVersion[0]==2 && formatVersion[1]>=2)
302 );
303 unicodeVersion = reader.getUnicodeVersion();
304 b.close();
305 }
306 }
307
308 /* ---------------------------------------------------------------------- */
309
310 /* Korean Hangul and Jamo constants */
311
312 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
313 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
314 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
315
316 public static final int HANGUL_BASE=0xac00;
317
318 public static final int JAMO_L_COUNT=19;
319 public static final int JAMO_V_COUNT=21;
320 public static final int JAMO_T_COUNT=28;
321 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
322
323 private static boolean isHangulWithoutJamoT(char c) {
324 c-=HANGUL_BASE;
325 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
326 }
327
328 /* norm32 helpers */
329
330 /* is this a norm32 with a regular index? */
331 private static boolean isNorm32Regular(long norm32) {
332 return norm32<MIN_SPECIAL;
333 }
334
335 /* is this a norm32 with a special index for a lead surrogate? */
336 private static boolean isNorm32LeadSurrogate(long norm32) {
337 return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
338 }
339
340 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
341 private static boolean isNorm32HangulOrJamo(long norm32) {
342 return norm32>=MIN_HANGUL;
343 }
344
345 /*
346 * Given norm32 for Jamo V or T,
347 * is this a Jamo V?
348 */
349 private static boolean isJamoVTNorm32JamoV(long norm32) {
350 return norm32<JAMO_V_TOP;
351 }
352
353 /* data access primitives ----------------------------------------------- */
354
355 public static long/*unsigned*/ getNorm32(char c) {
356 return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
357 }
358
359 public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
360 char c2) {
361 /*
362 * the surrogate index in norm32 stores only the number of the surrogate
363 * index block see gennorm/store.c/getFoldedNormValue()
364 */
365 return ((UNSIGNED_INT_MASK) &
366 NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
367 }
368 ///CLOVER:OFF
369 private static long getNorm32(int c){
370 return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
371 }
372
373 /*
374 * get a norm32 from text with complete code points
375 * (like from decompositions)
376 */
377 private static long/*unsigned*/ getNorm32(char[] p,int start,
378 int/*unsigned*/ mask) {
379 long/*unsigned*/ norm32= getNorm32(p[start]);
380 if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
381 /* *p is a lead surrogate, get the real norm32 */
382 norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
383 }
384 return norm32;
385 }
386
387 //// for StringPrep
388 public static VersionInfo getUnicodeVersion(){
389 return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
390 unicodeVersion[2], unicodeVersion[3]);
391 }
392
393 public static char getFCD16(char c) {
394 return FCDTrieImpl.fcdTrie.getLeadValue(c);
395 }
396
397 public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
398 /* the surrogate index in fcd16 is an absolute offset over the
399 * start of stage 1
400 * */
401 return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
402 }
403 public static int getFCD16(int c) {
404 return FCDTrieImpl.fcdTrie.getCodePointValue(c);
405 }
406
407 private static int getExtraDataIndex(long norm32) {
408 return (int)(norm32>>EXTRA_SHIFT);
409 }
410
411 private static final class DecomposeArgs{
412 int /*unsigned byte*/ cc;
413 int /*unsigned byte*/ trailCC;
414 int length;
415 }
416 /**
417 *
418 * get the canonical or compatibility decomposition for one character
419 *
420 * @return index into the extraData array
421 */
422 private static int/*index*/ decompose(long/*unsigned*/ norm32,
423 int/*unsigned*/ qcMask,
424 DecomposeArgs args) {
425 int p= getExtraDataIndex(norm32);
426 args.length=extraData[p++];
427
428 if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
429 /* use compatibility decomposition, skip canonical data */
430 p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
431 args.length>>=8;
432 }
433
434 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
435 /* get the lead and trail cc's */
436 char bothCCs=extraData[p++];
437 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
438 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
439 } else {
440 /* lead and trail cc's are both 0 */
441 args.cc=args.trailCC=0;
442 }
443
444 args.length&=DECOMP_LENGTH_MASK;
445 return p;
446 }
447
448
449 /**
450 * get the canonical decomposition for one character
451 * @return index into the extraData array
452 */
453 private static int decompose(long/*unsigned*/ norm32,
454 DecomposeArgs args) {
455
456 int p= getExtraDataIndex(norm32);
457 args.length=extraData[p++];
458
459 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
460 /* get the lead and trail cc's */
461 char bothCCs=extraData[p++];
462 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
463 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
464 } else {
465 /* lead and trail cc's are both 0 */
466 args.cc=args.trailCC=0;
467 }
468
469 args.length&=DECOMP_LENGTH_MASK;
470 return p;
471 }
472
473
474 private static final class NextCCArgs{
475 char[] source;
476 int next;
477 int limit;
478 char c;
479 char c2;
480 }
481
482 /*
483 * get the combining class of (c, c2)= args.source[args.next++]
484 * before: args.next<args.limit after: args.next<=args.limit
485 * if only one code unit is used, then c2==0
486 */
487 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
488 long /*unsigned*/ norm32;
489
490 args.c=args.source[args.next++];
491
492 norm32= getNorm32(args.c);
493 if((norm32 & CC_MASK)==0) {
494 args.c2=0;
495 return 0;
496 } else {
497 if(!isNorm32LeadSurrogate(norm32)) {
498 args.c2=0;
499 } else {
500 /* c is a lead surrogate, get the real norm32 */
501 if(args.next!=args.limit &&
502 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
503 ++args.next;
504 norm32=getNorm32FromSurrogatePair(norm32, args.c2);
505 } else {
506 args.c2=0;
507 return 0;
508 }
509 }
510
511 return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
512 }
513 }
514
515 private static final class PrevArgs{
516 char[] src;
517 int start;
518 int current;
519 char c;
520 char c2;
521 }
522
523 /*
524 * read backwards and get norm32
525 * return 0 if the character is <minC
526 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
527 * surrogate but read second!)
528 */
529 private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
530 int/*unsigned*/ minC,
531 int/*unsigned*/ mask) {
532 long/*unsigned*/ norm32;
533
534 args.c=args.src[--args.current];
535 args.c2=0;
536
537 /* check for a surrogate before getting norm32 to see if we need to
538 * predecrement further
539 */
540 if(args.c<minC) {
541 return 0;
542 } else if(!UTF16.isSurrogate(args.c)) {
543 return getNorm32(args.c);
544 } else if(UTF16.isLeadSurrogate(args.c)) {
545 /* unpaired first surrogate */
546 return 0;
547 } else if(args.current!=args.start &&
548 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
549 --args.current;
550 norm32=getNorm32(args.c2);
551
552 if((norm32&mask)==0) {
553 /* all surrogate pairs with this lead surrogate have
554 * only irrelevant data
555 */
556 return 0;
557 } else {
558 /* norm32 must be a surrogate special */
559 return getNorm32FromSurrogatePair(norm32, args.c);
560 }
561 } else {
562 /* unpaired second surrogate */
563 args.c2=0;
564 return 0;
565 }
566 }
567
568 /*
569 * get the combining class of (c, c2)=*--p
570 * before: start<p after: start<=p
571 */
572 private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
573
574 return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
575 CC_MASK)>>CC_SHIFT));
576 }
577
578 /*
579 * is this a safe boundary character for NF*D?
580 * (lead cc==0)
581 */
582 public static boolean isNFDSafe(long/*unsigned*/ norm32,
583 int/*unsigned*/ccOrQCMask,
584 int/*unsigned*/ decompQCMask) {
585 if((norm32&ccOrQCMask)==0) {
586 return true; /* cc==0 and no decomposition: this is NF*D safe */
587 }
588
589 /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
590 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
591 DecomposeArgs args=new DecomposeArgs();
592 /* decomposes, get everything from the variable-length extra data */
593 decompose(norm32, decompQCMask, args);
594 return args.cc==0;
595 } else {
596 /* no decomposition (or Hangul), test the cc directly */
597 return (norm32&CC_MASK)==0;
598 }
599 }
600
601 /*
602 * is this (or does its decomposition begin with) a "true starter"?
603 * (cc==0 and NF*C_YES)
604 */
605 public static boolean isTrueStarter(long/*unsigned*/ norm32,
606 int/*unsigned*/ ccOrQCMask,
607 int/*unsigned*/ decompQCMask) {
608 if((norm32&ccOrQCMask)==0) {
609 return true; /* this is a true starter (could be Hangul or Jamo L)*/
610 }
611
612 /* inspect its decomposition - not a Hangul or a surrogate here */
613 if((norm32&decompQCMask)!=0) {
614 int p; /* index into extra data array */
615 DecomposeArgs args=new DecomposeArgs();
616 /* decomposes, get everything from the variable-length extra data */
617 p=decompose(norm32, decompQCMask, args);
618
619 if(args.cc==0) {
620 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
621
622 /* does it begin with NFC_YES? */
623 if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
624 /* yes, the decomposition begins with a true starter */
625 return true;
626 }
627 }
628 }
629 return false;
630 }
631
632 /* reorder UTF-16 in-place ---------------------------------------------- */
633
634 /**
635 * simpler, single-character version of mergeOrdered() -
636 * bubble-insert one single code point into the preceding string
637 * which is already canonically ordered
638 * (c, c2) may or may not yet have been inserted at src[current]..src[p]
639 *
640 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
641 *
642 * before: src[start]..src[current] is already ordered, and
643 * src[current]..src[p] may or may not hold (c, c2) but
644 * must be exactly the same length as (c, c2)
645 * after: src[start]..src[p] is ordered
646 *
647 * @return the trailing combining class
648 */
649 private static int/*unsigned byte*/ insertOrdered(char[] source,
650 int start,
651 int current, int p,
652 char c, char c2,
653 int/*unsigned byte*/ cc) {
654 int back, preBack;
655 int r;
656 int prevCC, trailCC=cc;
657
658 if(start<current && cc!=0) {
659 // search for the insertion point where cc>=prevCC
660 preBack=back=current;
661 PrevArgs prevArgs = new PrevArgs();
662 prevArgs.current = current;
663 prevArgs.start = start;
664 prevArgs.src = source;
665 // get the prevCC
666 prevCC=getPrevCC(prevArgs);
667 preBack = prevArgs.current;
668
669 if(cc<prevCC) {
670 // this will be the last code point, so keep its cc
671 trailCC=prevCC;
672 back=preBack;
673 while(start<preBack) {
674 prevCC=getPrevCC(prevArgs);
675 preBack=prevArgs.current;
676 if(cc>=prevCC) {
677 break;
678 }
679 back=preBack;
680 }
681
682
683 // this is where we are right now with all these indicies:
684 // [start]..[pPreBack] 0..? code points that we can ignore
685 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
686 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
687 // [current]..[p] 1 code point (c, c2) with cc
688
689 // move the code units in between up
690 r=p;
691 do {
692 source[--r]=source[--current];
693 } while(back!=current);
694 }
695 }
696
697 // insert (c, c2)
698 source[current]=c;
699 if(c2!=0) {
700 source[(current+1)]=c2;
701 }
702
703 // we know the cc of the last code point
704 return trailCC;
705 }
706
707 /**
708 * merge two UTF-16 string parts together
709 * to canonically order (order by combining classes) their concatenation
710 *
711 * the two strings may already be adjacent, so that the merging is done
712 * in-place if the two strings are not adjacent, then the buffer holding the
713 * first one must be large enough
714 * the second string may or may not be ordered in itself
715 *
716 * before: [start]..[current] is already ordered, and
717 * [next]..[limit] may be ordered in itself, but
718 * is not in relation to [start..current[
719 * after: [start..current+(limit-next)[ is ordered
720 *
721 * the algorithm is a simple bubble-sort that takes the characters from
722 * src[next++] and inserts them in correct combining class order into the
723 * preceding part of the string
724 *
725 * since this function is called much less often than the single-code point
726 * insertOrdered(), it just uses that for easier maintenance
727 *
728 * @return the trailing combining class
729 */
730 private static int /*unsigned byte*/ mergeOrdered(char[] source,
731 int start,
732 int current,
733 char[] data,
734 int next,
735 int limit,
736 boolean isOrdered) {
737 int r;
738 int /*unsigned byte*/ cc, trailCC=0;
739 boolean adjacent;
740
741 adjacent= current==next;
742 NextCCArgs ncArgs = new NextCCArgs();
743 ncArgs.source = data;
744 ncArgs.next = next;
745 ncArgs.limit = limit;
746
747 if(start!=current || !isOrdered) {
748
749 while(ncArgs.next<ncArgs.limit) {
750 cc=getNextCC(ncArgs);
751 if(cc==0) {
752 // does not bubble back
753 trailCC=0;
754 if(adjacent) {
755 current=ncArgs.next;
756 } else {
757 data[current++]=ncArgs.c;
758 if(ncArgs.c2!=0) {
759 data[current++]=ncArgs.c2;
760 }
761 }
762 if(isOrdered) {
763 break;
764 } else {
765 start=current;
766 }
767 } else {
768 r=current+(ncArgs.c2==0 ? 1 : 2);
769 trailCC=insertOrdered(source,start, current, r,
770 ncArgs.c, ncArgs.c2, cc);
771 current=r;
772 }
773 }
774 }
775
776 if(ncArgs.next==ncArgs.limit) {
777 // we know the cc of the last code point
778 return trailCC;
779 } else {
780 if(!adjacent) {
781 // copy the second string part
782 do {
783 source[current++]=data[ncArgs.next++];
784 } while(ncArgs.next!=ncArgs.limit);
785 ncArgs.limit=current;
786 }
787 PrevArgs prevArgs = new PrevArgs();
788 prevArgs.src = data;
789 prevArgs.start = start;
790 prevArgs.current = ncArgs.limit;
791 return getPrevCC(prevArgs);
792 }
793
794 }
795 private static int /*unsigned byte*/ mergeOrdered(char[] source,
796 int start,
797 int current,
798 char[] data,
799 final int next,
800 final int limit) {
801 return mergeOrdered(source,start,current,data,next,limit,true);
802 }
803
804 public static NormalizerBase.QuickCheckResult quickCheck(char[] src,
805 int srcStart,
806 int srcLimit,
807 int minNoMaybe,
808 int qcMask,
809 int options,
810 boolean allowMaybe,
811 UnicodeSet nx){
812
813 int ccOrQCMask;
814 long norm32;
815 char c, c2;
816 char cc, prevCC;
817 long qcNorm32;
818 NormalizerBase.QuickCheckResult result;
819 ComposePartArgs args = new ComposePartArgs();
820 char[] buffer ;
821 int start = srcStart;
822
823 if(!isDataLoaded) {
824 return NormalizerBase.MAYBE;
825 }
826 // initialize
827 ccOrQCMask=CC_MASK|qcMask;
828 result=NormalizerBase.YES;
829 prevCC=0;
830
831 for(;;) {
832 for(;;) {
833 if(srcStart==srcLimit) {
834 return result;
835 } else if((c=src[srcStart++])>=minNoMaybe &&
836 (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
837 break;
838 }
839 prevCC=0;
840 }
841
842
843 // check one above-minimum, relevant code unit
844 if(isNorm32LeadSurrogate(norm32)) {
845 // c is a lead surrogate, get the real norm32
846 if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
847 ++srcStart;
848 norm32=getNorm32FromSurrogatePair(norm32,c2);
849 } else {
850 norm32=0;
851 c2=0;
852 }
853 }else{
854 c2=0;
855 }
856 if(nx_contains(nx, c, c2)) {
857 /* excluded: norm32==0 */
858 norm32=0;
859 }
860
861 // check the combining order
862 cc=(char)((norm32>>CC_SHIFT)&0xFF);
863 if(cc!=0 && cc<prevCC) {
864 return NormalizerBase.NO;
865 }
866 prevCC=cc;
867
868 // check for "no" or "maybe" quick check flags
869 qcNorm32 = norm32 & qcMask;
870 if((qcNorm32& QC_ANY_NO)>=1) {
871 result= NormalizerBase.NO;
872 break;
873 } else if(qcNorm32!=0) {
874 // "maybe" can only occur for NFC and NFKC
875 if(allowMaybe){
876 result=NormalizerBase.MAYBE;
877 }else{
878 // normalize a section around here to see if it is really
879 // normalized or not
880 int prevStarter;
881 int/*unsigned*/ decompQCMask;
882
883 decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
884
885 // find the previous starter
886
887 // set prevStarter to the beginning of the current character
888 prevStarter=srcStart-1;
889 if(UTF16.isTrailSurrogate(src[prevStarter])) {
890 // safe because unpaired surrogates do not result
891 // in "maybe"
892 --prevStarter;
893 }
894 prevStarter=findPreviousStarter(src, start, prevStarter,
895 ccOrQCMask, decompQCMask,
896 (char)minNoMaybe);
897
898 // find the next true starter in [src..limit[ - modifies
899 // src to point to the next starter
900 srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
901 decompQCMask,(char) minNoMaybe);
902
903 //set the args for compose part
904 args.prevCC = prevCC;
905
906 // decompose and recompose [prevStarter..src[
907 buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
908
909 // compare the normalized version with the original
910 if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
911 result=NormalizerBase.NO; // normalization differs
912 break;
913 }
914
915 // continue after the next starter
916 }
917 }
918 }
919 return result;
920 }
921
922
923 //------------------------------------------------------
924 // make NFD & NFKD
925 //------------------------------------------------------
926
927 public static int decompose(char[] src,int srcStart,int srcLimit,
928 char[] dest,int destStart,int destLimit,
929 boolean compat,int[] outTrailCC,
930 UnicodeSet nx) {
931
932 char[] buffer = new char[3];
933 int prevSrc;
934 long norm32;
935 int ccOrQCMask, qcMask;
936 int reorderStartIndex, length;
937 char c, c2, minNoMaybe;
938 int/*unsigned byte*/ cc, prevCC, trailCC;
939 char[] p;
940 int pStart;
941 int destIndex = destStart;
942 int srcIndex = srcStart;
943 if(!compat) {
944 minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
945 qcMask=QC_NFD;
946 } else {
947 minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
948 qcMask=QC_NFKD;
949 }
950
951 /* initialize */
952 ccOrQCMask=CC_MASK|qcMask;
953 reorderStartIndex=0;
954 prevCC=0;
955 norm32=0;
956 c=0;
957 pStart=0;
958
959 cc=trailCC=-1;//initialize to bogus value
960
961 for(;;) {
962 /* count code units below the minimum or with irrelevant data for
963 * the quick check
964 */
965 prevSrc=srcIndex;
966
967 while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
968 ((norm32=getNorm32(c))&ccOrQCMask)==0)){
969 prevCC=0;
970 ++srcIndex;
971 }
972
973 /* copy these code units all at once */
974 if(srcIndex!=prevSrc) {
975 length=(int)(srcIndex-prevSrc);
976 if((destIndex+length)<=destLimit) {
977 System.arraycopy(src,prevSrc,dest,destIndex,length);
978 }
979
980 destIndex+=length;
981 reorderStartIndex=destIndex;
982 }
983
984 /* end of source reached? */
985 if(srcIndex==srcLimit) {
986 break;
987 }
988
989 /* c already contains *src and norm32 is set for it, increment src*/
990 ++srcIndex;
991
992 /* check one above-minimum, relevant code unit */
993 /*
994 * generally, set p and length to the decomposition string
995 * in simple cases, p==NULL and (c, c2) will hold the length code
996 * units to append in all cases, set cc to the lead and trailCC to
997 * the trail combining class
998 *
999 * the following merge-sort of the current character into the
1000 * preceding, canonically ordered result text will use the
1001 * optimized insertOrdered()
1002 * if there is only one single code point to process;
1003 * this is indicated with p==NULL, and (c, c2) is the character to
1004 * insert
1005 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1006 * for a supplementary character)
1007 * otherwise, p[length] is merged in with _mergeOrdered()
1008 */
1009 if(isNorm32HangulOrJamo(norm32)) {
1010 if(nx_contains(nx, c)) {
1011 c2=0;
1012 p=null;
1013 length=1;
1014 } else {
1015 // Hangul syllable: decompose algorithmically
1016 p=buffer;
1017 pStart=0;
1018 cc=trailCC=0;
1019
1020 c-=HANGUL_BASE;
1021
1022 c2=(char)(c%JAMO_T_COUNT);
1023 c/=JAMO_T_COUNT;
1024 if(c2>0) {
1025 buffer[2]=(char)(JAMO_T_BASE+c2);
1026 length=3;
1027 } else {
1028 length=2;
1029 }
1030
1031 buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
1032 buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
1033 }
1034 } else {
1035 if(isNorm32Regular(norm32)) {
1036 c2=0;
1037 length=1;
1038 } else {
1039 // c is a lead surrogate, get the real norm32
1040 if(srcIndex!=srcLimit &&
1041 UTF16.isTrailSurrogate(c2=src[srcIndex])) {
1042 ++srcIndex;
1043 length=2;
1044 norm32=getNorm32FromSurrogatePair(norm32, c2);
1045 } else {
1046 c2=0;
1047 length=1;
1048 norm32=0;
1049 }
1050 }
1051
1052 /* get the decomposition and the lead and trail cc's */
1053 if(nx_contains(nx, c, c2)) {
1054 /* excluded: norm32==0 */
1055 cc=trailCC=0;
1056 p=null;
1057 } else if((norm32&qcMask)==0) {
1058 /* c does not decompose */
1059 cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
1060 p=null;
1061 pStart=-1;
1062 } else {
1063 DecomposeArgs arg = new DecomposeArgs();
1064 /* c decomposes, get everything from the variable-length
1065 * extra data
1066 */
1067 pStart=decompose(norm32, qcMask, arg);
1068 p=extraData;
1069 length=arg.length;
1070 cc=arg.cc;
1071 trailCC=arg.trailCC;
1072 if(length==1) {
1073 /* fastpath a single code unit from decomposition */
1074 c=p[pStart];
1075 c2=0;
1076 p=null;
1077 pStart=-1;
1078 }
1079 }
1080 }
1081
1082 /* append the decomposition to the destination buffer, assume
1083 * length>0
1084 */
1085 if((destIndex+length)<=destLimit) {
1086 int reorderSplit=destIndex;
1087 if(p==null) {
1088 /* fastpath: single code point */
1089 if(cc!=0 && cc<prevCC) {
1090 /* (c, c2) is out of order with respect to the preceding
1091 * text
1092 */
1093 destIndex+=length;
1094 trailCC=insertOrdered(dest,reorderStartIndex,
1095 reorderSplit, destIndex, c, c2, cc);
1096 } else {
1097 /* just append (c, c2) */
1098 dest[destIndex++]=c;
1099 if(c2!=0) {
1100 dest[destIndex++]=c2;
1101 }
1102 }
1103 } else {
1104 /* general: multiple code points (ordered by themselves)
1105 * from decomposition
1106 */
1107 if(cc!=0 && cc<prevCC) {
1108 /* the decomposition is out of order with respect to the
1109 * preceding text
1110 */
1111 destIndex+=length;
1112 trailCC=mergeOrdered(dest,reorderStartIndex,
1113 reorderSplit,p, pStart,pStart+length);
1114 } else {
1115 /* just append the decomposition */
1116 do {
1117 dest[destIndex++]=p[pStart++];
1118 } while(--length>0);
1119 }
1120 }
1121 } else {
1122 /* buffer overflow */
1123 /* keep incrementing the destIndex for preflighting */
1124 destIndex+=length;
1125 }
1126
1127 prevCC=trailCC;
1128 if(prevCC==0) {
1129 reorderStartIndex=destIndex;
1130 }
1131 }
1132
1133 outTrailCC[0]=prevCC;
1134
1135 return destIndex - destStart;
1136 }
1137
1138 /* make NFC & NFKC ------------------------------------------------------ */
1139 private static final class NextCombiningArgs{
1140 char[] source;
1141 int start;
1142 //int limit;
1143 char c;
1144 char c2;
1145 int/*unsigned*/ combiningIndex;
1146 char /*unsigned byte*/ cc;
1147 }
1148
1149 /* get the composition properties of the next character */
1150 private static int /*unsigned*/ getNextCombining(NextCombiningArgs args,
1151 int limit,
1152 UnicodeSet nx) {
1153 long/*unsigned*/ norm32;
1154 int combineFlags;
1155 /* get properties */
1156 args.c=args.source[args.start++];
1157 norm32=getNorm32(args.c);
1158
1159 /* preset output values for most characters */
1160 args.c2=0;
1161 args.combiningIndex=0;
1162 args.cc=0;
1163
1164 if((norm32&(CC_MASK|COMBINES_ANY))==0) {
1165 return 0;
1166 } else {
1167 if(isNorm32Regular(norm32)) {
1168 /* set cc etc. below */
1169 } else if(isNorm32HangulOrJamo(norm32)) {
1170 /* a compatibility decomposition contained Jamos */
1171 args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
1172 (norm32>>EXTRA_SHIFT)));
1173 return (int)(norm32&COMBINES_ANY);
1174 } else {
1175 /* c is a lead surrogate, get the real norm32 */
1176 if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
1177 args.source[args.start])) {
1178 ++args.start;
1179 norm32=getNorm32FromSurrogatePair(norm32, args.c2);
1180 } else {
1181 args.c2=0;
1182 return 0;
1183 }
1184 }
1185
1186 if(nx_contains(nx, args.c, args.c2)) {
1187 return 0; /* excluded: norm32==0 */
1188 }
1189
1190 args.cc= (char)((norm32>>CC_SHIFT)&0xff);
1191
1192 combineFlags=(int)(norm32&COMBINES_ANY);
1193 if(combineFlags!=0) {
1194 int index = getExtraDataIndex(norm32);
1195 args.combiningIndex=index>0 ? extraData[(index-1)] :0;
1196 }
1197
1198 return combineFlags;
1199 }
1200 }
1201
1202 /*
1203 * given a composition-result starter (c, c2) - which means its cc==0,
1204 * it combines forward, it has extra data, its norm32!=0,
1205 * it is not a Hangul or Jamo,
1206 * get just its combineFwdIndex
1207 *
1208 * norm32(c) is special if and only if c2!=0
1209 */
1210 private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
1211 long/*unsigned*/ norm32;
1212
1213 norm32=getNorm32(c);
1214 if(c2!=0) {
1215 norm32=getNorm32FromSurrogatePair(norm32, c2);
1216 }
1217 return extraData[(getExtraDataIndex(norm32)-1)];
1218 }
1219
1220 /*
1221 * Find the recomposition result for
1222 * a forward-combining character
1223 * (specified with a pointer to its part of the combiningTable[])
1224 * and a backward-combining character
1225 * (specified with its combineBackIndex).
1226 *
1227 * If these two characters combine, then set (value, value2)
1228 * with the code unit(s) of the composition character.
1229 *
1230 * Return value:
1231 * 0 do not combine
1232 * 1 combine
1233 * >1 combine, and the composition is a forward-combining starter
1234 *
1235 * See unormimp.h for a description of the composition table format.
1236 */
1237 private static int/*unsigned*/ combine(char[]table,int tableStart,
1238 int/*unsinged*/ combineBackIndex,
1239 int[] outValues) {
1240 int/*unsigned*/ key;
1241 int value,value2;
1242
1243 if(outValues.length<2){
1244 throw new IllegalArgumentException();
1245 }
1246
1247 /* search in the starter's composition table */
1248 for(;;) {
1249 key=table[tableStart++];
1250 if(key>=combineBackIndex) {
1251 break;
1252 }
1253 tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
1254 }
1255
1256 /* mask off bit 15, the last-entry-in-the-list flag */
1257 if((key&0x7fff)==combineBackIndex) {
1258 /* found! combine! */
1259 value=table[tableStart];
1260
1261 /* is the composition a starter that combines forward? */
1262 key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
1263
1264 /* get the composition result code point from the variable-length
1265 * result value
1266 */
1267 if((value&0x8000) != 0) {
1268 if((value&0x4000) != 0) {
1269 /* surrogate pair composition result */
1270 value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
1271 value2=table[tableStart+1];
1272 } else {
1273 /* BMP composition result U+2000..U+ffff */
1274 value=table[tableStart+1];
1275 value2=0;
1276 }
1277 } else {
1278 /* BMP composition result U+0000..U+1fff */
1279 value&=0x1fff;
1280 value2=0;
1281 }
1282 outValues[0]=value;
1283 outValues[1]=value2;
1284 return key;
1285 } else {
1286 /* not found */
1287 return 0;
1288 }
1289 }
1290
1291
1292 private static final class RecomposeArgs{
1293 char[] source;
1294 int start;
1295 int limit;
1296 }
1297 /*
1298 * recompose the characters in [p..limit[
1299 * (which is in NFD - decomposed and canonically ordered),
1300 * adjust limit, and return the trailing cc
1301 *
1302 * since for NFKC we may get Jamos in decompositions, we need to
1303 * recompose those too
1304 *
1305 * note that recomposition never lengthens the text:
1306 * any character consists of either one or two code units;
1307 * a composition may contain at most one more code unit than the original
1308 * starter, while the combining mark that is removed has at least one code
1309 * unit
1310 */
1311 private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
1312 int remove, q, r;
1313 int /*unsigned*/ combineFlags;
1314 int /*unsigned*/ combineFwdIndex, combineBackIndex;
1315 int /*unsigned*/ result, value=0, value2=0;
1316 int /*unsigned byte*/ prevCC;
1317 boolean starterIsSupplementary;
1318 int starter;
1319 int[] outValues = new int[2];
1320 starter=-1; /* no starter */
1321 combineFwdIndex=0; /* will not be used until starter!=NULL */
1322 starterIsSupplementary=false; /* will not be used until starter!=NULL */
1323 prevCC=0;
1324
1325 NextCombiningArgs ncArg = new NextCombiningArgs();
1326 ncArg.source = args.source;
1327
1328 ncArg.cc =0;
1329 ncArg.c2 =0;
1330
1331 for(;;) {
1332 ncArg.start = args.start;
1333 combineFlags=getNextCombining(ncArg,args.limit,nx);
1334 combineBackIndex=ncArg.combiningIndex;
1335 args.start = ncArg.start;
1336
1337 if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
1338 if((combineBackIndex&0x8000)!=0) {
1339 /* c is a Jamo V/T, see if we can compose it with the
1340 * previous character
1341 */
1342 /* for the PRI #29 fix, check that there is no intervening combining mark */
1343 if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
1344 remove=-1; /* NULL while no Hangul composition */
1345 combineFlags=0;
1346 ncArg.c2=args.source[starter];
1347 if(combineBackIndex==0xfff2) {
1348 /* Jamo V, compose with previous Jamo L and following
1349 * Jamo T
1350 */
1351 ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
1352 if(ncArg.c2<JAMO_L_COUNT) {
1353 remove=args.start-1;
1354 ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
1355 (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
1356 if(args.start!=args.limit &&
1357 (ncArg.c2=(char)(args.source[args.start]
1358 -JAMO_T_BASE))<JAMO_T_COUNT) {
1359 ++args.start;
1360 ncArg.c+=ncArg.c2;
1361 } else {
1362 /* the result is an LV syllable, which is a starter (unlike LVT) */
1363 combineFlags=COMBINES_FWD;
1364 }
1365 if(!nx_contains(nx, ncArg.c)) {
1366 args.source[starter]=ncArg.c;
1367 } else {
1368 /* excluded */
1369 if(!isHangulWithoutJamoT(ncArg.c)) {
1370 --args.start; /* undo the ++args.start from reading the Jamo T */
1371 }
1372 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1373 remove=args.start;
1374 }
1375 }
1376
1377 /*
1378 * Normally, the following can not occur:
1379 * Since the input is in NFD, there are no Hangul LV syllables that
1380 * a Jamo T could combine with.
1381 * All Jamo Ts are combined above when handling Jamo Vs.
1382 *
1383 * However, before the PRI #29 fix, this can occur due to
1384 * an intervening combining mark between the Hangul LV and the Jamo T.
1385 */
1386 } else {
1387 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1388 if(isHangulWithoutJamoT(ncArg.c2)) {
1389 ncArg.c2+=ncArg.c-JAMO_T_BASE;
1390 if(!nx_contains(nx, ncArg.c2)) {
1391 remove=args.start-1;
1392 args.source[starter]=ncArg.c2;
1393 }
1394 }
1395 }
1396
1397 if(remove!=-1) {
1398 /* remove the Jamo(s) */
1399 q=remove;
1400 r=args.start;
1401 while(r<args.limit) {
1402 args.source[q++]=args.source[r++];
1403 }
1404 args.start=remove;
1405 args.limit=q;
1406 }
1407
1408 ncArg.c2=0; /* c2 held *starter temporarily */
1409
1410 if(combineFlags!=0) {
1411 /*
1412 * not starter=NULL because the composition is a Hangul LV syllable
1413 * and might combine once more (but only before the PRI #29 fix)
1414 */
1415
1416 /* done? */
1417 if(args.start==args.limit) {
1418 return (char)prevCC;
1419 }
1420
1421 /* the composition is a Hangul LV syllable which is a starter that combines forward */
1422 combineFwdIndex=0xfff0;
1423
1424 /* we combined; continue with looking for compositions */
1425 continue;
1426 }
1427 }
1428
1429 /*
1430 * now: cc==0 and the combining index does not include
1431 * "forward" -> the rest of the loop body will reset starter
1432 * to NULL; technically, a composed Hangul syllable is a
1433 * starter, but it does not combine forward now that we have
1434 * consumed all eligible Jamos; for Jamo V/T, combineFlags
1435 * does not contain _NORM_COMBINES_FWD
1436 */
1437
1438 } else if(
1439 /* the starter is not a Hangul LV or Jamo V/T and */
1440 !((combineFwdIndex&0x8000)!=0) &&
1441 /* the combining mark is not blocked and */
1442 ((options&BEFORE_PRI_29)!=0 ?
1443 (prevCC!=ncArg.cc || prevCC==0) :
1444 (prevCC<ncArg.cc || prevCC==0)) &&
1445 /* the starter and the combining mark (c, c2) do combine */
1446 0!=(result=combine(combiningTable,combineFwdIndex,
1447 combineBackIndex, outValues)) &&
1448 /* the composition result is not excluded */
1449 !nx_contains(nx, (char)value, (char)value2)
1450 ) {
1451 value=outValues[0];
1452 value2=outValues[1];
1453 /* replace the starter with the composition, remove the
1454 * combining mark
1455 */
1456 remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
1457
1458 /* replace the starter with the composition */
1459 args.source[starter]=(char)value;
1460 if(starterIsSupplementary) {
1461 if(value2!=0) {
1462 /* both are supplementary */
1463 args.source[starter+1]=(char)value2;
1464 } else {
1465 /* the composition is shorter than the starter,
1466 * move the intermediate characters forward one */
1467 starterIsSupplementary=false;
1468 q=starter+1;
1469 r=q+1;
1470 while(r<remove) {
1471 args.source[q++]=args.source[r++];
1472 }
1473 --remove;
1474 }
1475 } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB
1476 starterIsSupplementary=true;
1477 args.source[starter+1]=(char)value2;
1478 /* } else { both are on the BMP, nothing more to do */
1479 }
1480
1481 /* remove the combining mark by moving the following text
1482 * over it */
1483 if(remove<args.start) {
1484 q=remove;
1485 r=args.start;
1486 while(r<args.limit) {
1487 args.source[q++]=args.source[r++];
1488 }
1489 args.start=remove;
1490 args.limit=q;
1491 }
1492
1493 /* keep prevCC because we removed the combining mark */
1494
1495 /* done? */
1496 if(args.start==args.limit) {
1497 return (char)prevCC;
1498 }
1499
1500 /* is the composition a starter that combines forward? */
1501 if(result>1) {
1502 combineFwdIndex=getCombiningIndexFromStarter((char)value,
1503 (char)value2);
1504 } else {
1505 starter=-1;
1506 }
1507
1508 /* we combined; continue with looking for compositions */
1509 continue;
1510 }
1511 }
1512
1513 /* no combination this time */
1514 prevCC=ncArg.cc;
1515 if(args.start==args.limit) {
1516 return (char)prevCC;
1517 }
1518
1519 /* if (c, c2) did not combine, then check if it is a starter */
1520 if(ncArg.cc==0) {
1521 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
1522 if((combineFlags&COMBINES_FWD)!=0) {
1523 /* it may combine with something, prepare for it */
1524 if(ncArg.c2==0) {
1525 starterIsSupplementary=false;
1526 starter=args.start-1;
1527 } else {
1528 starterIsSupplementary=false;
1529 starter=args.start-2;
1530 }
1531 combineFwdIndex=combineBackIndex;
1532 } else {
1533 /* it will not combine with anything */
1534 starter=-1;
1535 }
1536 } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
1537 /* FCC: no discontiguous compositions; any intervening character blocks */
1538 starter=-1;
1539 }
1540 }
1541 }
1542
1543 // find the last true starter between src[start]....src[current] going
1544 // backwards and return its index
1545 private static int findPreviousStarter(char[]src, int srcStart, int current,
1546 int/*unsigned*/ ccOrQCMask,
1547 int/*unsigned*/ decompQCMask,
1548 char minNoMaybe) {
1549 long norm32;
1550 PrevArgs args = new PrevArgs();
1551 args.src = src;
1552 args.start = srcStart;
1553 args.current = current;
1554
1555 while(args.start<args.current) {
1556 norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
1557 if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1558 break;
1559 }
1560 }
1561 return args.current;
1562 }
1563
1564 /* find the first true starter in [src..limit[ and return the
1565 * pointer to it
1566 */
1567 private static int/*index*/ findNextStarter(char[] src,int start,int limit,
1568 int/*unsigned*/ qcMask,
1569 int/*unsigned*/ decompQCMask,
1570 char minNoMaybe) {
1571 int p;
1572 long/*unsigned*/ norm32;
1573 int ccOrQCMask;
1574 char c, c2;
1575
1576 ccOrQCMask=CC_MASK|qcMask;
1577
1578 DecomposeArgs decompArgs = new DecomposeArgs();
1579
1580 for(;;) {
1581 if(start==limit) {
1582 break; /* end of string */
1583 }
1584 c=src[start];
1585 if(c<minNoMaybe) {
1586 break; /* catches NUL terminater, too */
1587 }
1588
1589 norm32=getNorm32(c);
1590 if((norm32&ccOrQCMask)==0) {
1591 break; /* true starter */
1592 }
1593
1594 if(isNorm32LeadSurrogate(norm32)) {
1595 /* c is a lead surrogate, get the real norm32 */
1596 if((start+1)==limit ||
1597 !UTF16.isTrailSurrogate(c2=(src[start+1]))){
1598 /* unmatched first surrogate: counts as a true starter */
1599 break;
1600 }
1601 norm32=getNorm32FromSurrogatePair(norm32, c2);
1602
1603 if((norm32&ccOrQCMask)==0) {
1604 break; /* true starter */
1605 }
1606 } else {
1607 c2=0;
1608 }
1609
1610 /* (c, c2) is not a true starter but its decomposition may be */
1611 if((norm32&decompQCMask)!=0) {
1612 /* (c, c2) decomposes, get everything from the variable-length
1613 * extra data */
1614 p=decompose(norm32, decompQCMask, decompArgs);
1615
1616 /* get the first character's norm32 to check if it is a true
1617 * starter */
1618 if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
1619 break; /* true starter */
1620 }
1621 }
1622
1623 start+= c2==0 ? 1 : 2; /* not a true starter, continue */
1624 }
1625
1626 return start;
1627 }
1628
1629
1630 private static final class ComposePartArgs{
1631 int prevCC;
1632 int length; /* length of decomposed part */
1633 }
1634
1635 /* decompose and recompose [prevStarter..src[ */
1636 private static char[] composePart(ComposePartArgs args,
1637 int prevStarter,
1638 char[] src, int start, int limit,
1639 int options,
1640 UnicodeSet nx) {
1641 int recomposeLimit;
1642 boolean compat =((options&OPTIONS_COMPAT)!=0);
1643
1644 /* decompose [prevStarter..src[ */
1645 int[] outTrailCC = new int[1];
1646 char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
1647
1648 for(;;){
1649 args.length=decompose(src,prevStarter,(start),
1650 buffer,0,buffer.length,
1651 compat,outTrailCC,nx);
1652 if(args.length<=buffer.length){
1653 break;
1654 }else{
1655 buffer = new char[args.length];
1656 }
1657 }
1658
1659 /* recompose the decomposition */
1660 recomposeLimit=args.length;
1661
1662 if(args.length>=2) {
1663 RecomposeArgs rcArgs = new RecomposeArgs();
1664 rcArgs.source = buffer;
1665 rcArgs.start = 0;
1666 rcArgs.limit = recomposeLimit;
1667 args.prevCC=recompose(rcArgs, options, nx);
1668 recomposeLimit = rcArgs.limit;
1669 }
1670
1671 /* return with a pointer to the recomposition and its length */
1672 args.length=recomposeLimit;
1673 return buffer;
1674 }
1675
1676 private static boolean composeHangul(char prev, char c,
1677 long/*unsigned*/ norm32,
1678 char[] src,int[] srcIndex, int limit,
1679 boolean compat,
1680 char[] dest,int destIndex,
1681 UnicodeSet nx) {
1682 int start=srcIndex[0];
1683 if(isJamoVTNorm32JamoV(norm32)) {
1684 /* c is a Jamo V, compose with previous Jamo L and
1685 * following Jamo T */
1686 prev=(char)(prev-JAMO_L_BASE);
1687 if(prev<JAMO_L_COUNT) {
1688 c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
1689 (c-JAMO_V_BASE))*JAMO_T_COUNT);
1690
1691 /* check if the next character is a Jamo T (normal or
1692 * compatibility) */
1693 if(start!=limit) {
1694 char next, t;
1695
1696 next=src[start];
1697 if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1698 /* normal Jamo T */
1699 ++start;
1700 c+=t;
1701 } else if(compat) {
1702 /* if NFKC, then check for compatibility Jamo T
1703 * (BMP only) */
1704 norm32=getNorm32(next);
1705 if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
1706 int p /*index into extra data array*/;
1707 DecomposeArgs dcArgs = new DecomposeArgs();
1708 p=decompose(norm32, QC_NFKD, dcArgs);
1709 if(dcArgs.length==1 &&
1710 (t=(char)(extraData[p]-JAMO_T_BASE))
1711 <JAMO_T_COUNT) {
1712 /* compatibility Jamo T */
1713 ++start;
1714 c+=t;
1715 }
1716 }
1717 }
1718 }
1719 if(nx_contains(nx, c)) {
1720 if(!isHangulWithoutJamoT(c)) {
1721 --start; /* undo ++start from reading the Jamo T */
1722 }
1723 return false;
1724 }
1725 dest[destIndex]=c;
1726 srcIndex[0]=start;
1727 return true;
1728 }
1729 } else if(isHangulWithoutJamoT(prev)) {
1730 /* c is a Jamo T, compose with previous Hangul LV that does not
1731 * contain a Jamo T */
1732 c=(char)(prev+(c-JAMO_T_BASE));
1733 if(nx_contains(nx, c)) {
1734 return false;
1735 }
1736 dest[destIndex]=c;
1737 srcIndex[0]=start;
1738 return true;
1739 }
1740 return false;
1741 }
1742 /*
1743 public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
1744 return compose(src,0,src.length,dest,0,dest.length,compat, nx);
1745 }
1746 */
1747
1748 public static int compose(char[] src, int srcStart, int srcLimit,
1749 char[] dest,int destStart,int destLimit,
1750 int options,UnicodeSet nx) {
1751
1752 int prevSrc, prevStarter;
1753 long/*unsigned*/ norm32;
1754 int ccOrQCMask, qcMask;
1755 int reorderStartIndex, length;
1756 char c, c2, minNoMaybe;
1757 int/*unsigned byte*/ cc, prevCC;
1758 int[] ioIndex = new int[1];
1759 int destIndex = destStart;
1760 int srcIndex = srcStart;
1761
1762 if((options&OPTIONS_COMPAT)!=0) {
1763 minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
1764 qcMask=QC_NFKC;
1765 } else {
1766 minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
1767 qcMask=QC_NFC;
1768 }
1769
1770 /*
1771 * prevStarter points to the last character before the current one
1772 * that is a "true" starter with cc==0 and quick check "yes".
1773 *
1774 * prevStarter will be used instead of looking for a true starter
1775 * while incrementally decomposing [prevStarter..prevSrc[
1776 * in _composePart(). Having a good prevStarter allows to just decompose
1777 * the entire [prevStarter..prevSrc[.
1778 *
1779 * When _composePart() backs out from prevSrc back to prevStarter,
1780 * then it also backs out destIndex by the same amount.
1781 * Therefore, at all times, the (prevSrc-prevStarter) source units
1782 * must correspond 1:1 to destination units counted with destIndex,
1783 * except for reordering.
1784 * This is true for the qc "yes" characters copied in the fast loop,
1785 * and for pure reordering.
1786 * prevStarter must be set forward to src when this is not true:
1787 * In _composePart() and after composing a Hangul syllable.
1788 *
1789 * This mechanism relies on the assumption that the decomposition of a
1790 * true starter also begins with a true starter. gennorm/store.c checks
1791 * for this.
1792 */
1793 prevStarter=srcIndex;
1794
1795 ccOrQCMask=CC_MASK|qcMask;
1796 /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
1797 prevCC=0;
1798
1799 /* avoid compiler warnings */
1800 norm32=0;
1801 c=0;
1802
1803 for(;;) {
1804 /* count code units below the minimum or with irrelevant data for
1805 * the quick check */
1806 prevSrc=srcIndex;
1807
1808 while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
1809 ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
1810 prevCC=0;
1811 ++srcIndex;
1812 }
1813
1814
1815 /* copy these code units all at once */
1816 if(srcIndex!=prevSrc) {
1817 length=(int)(srcIndex-prevSrc);
1818 if((destIndex+length)<=destLimit) {
1819 System.arraycopy(src,prevSrc,dest,destIndex,length);
1820 }
1821 destIndex+=length;
1822 reorderStartIndex=destIndex;
1823
1824 /* set prevStarter to the last character in the quick check
1825 * loop */
1826 prevStarter=srcIndex-1;
1827 if(UTF16.isTrailSurrogate(src[prevStarter]) &&
1828 prevSrc<prevStarter &&
1829 UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
1830 --prevStarter;
1831 }
1832
1833 prevSrc=srcIndex;
1834 }
1835
1836 /* end of source reached? */
1837 if(srcIndex==srcLimit) {
1838 break;
1839 }
1840
1841 /* c already contains *src and norm32 is set for it, increment src*/
1842 ++srcIndex;
1843
1844 /*
1845 * source buffer pointers:
1846 *
1847 * all done quick check current char not yet
1848 * "yes" but (c, c2) processed
1849 * may combine
1850 * forward
1851 * [-------------[-------------[-------------[-------------[
1852 * | | | | |
1853 * start prevStarter prevSrc src limit
1854 *
1855 *
1856 * destination buffer pointers and indexes:
1857 *
1858 * all done might take not filled yet
1859 * characters for
1860 * reordering
1861 * [-------------[-------------[-------------[
1862 * | | | |
1863 * dest reorderStartIndex destIndex destCapacity
1864 */
1865
1866 /* check one above-minimum, relevant code unit */
1867 /*
1868 * norm32 is for c=*(src-1), and the quick check flag is "no" or
1869 * "maybe", and/or cc!=0
1870 * check for Jamo V/T, then for surrogates and regular characters
1871 * c is not a Hangul syllable or Jamo L because
1872 * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
1873 */
1874 if(isNorm32HangulOrJamo(norm32)) {
1875 /*
1876 * c is a Jamo V/T:
1877 * try to compose with the previous character, Jamo V also with
1878 * a following Jamo T, and set values here right now in case we
1879 * just continue with the main loop
1880 */
1881 prevCC=cc=0;
1882 reorderStartIndex=destIndex;
1883 ioIndex[0]=srcIndex;
1884 if(
1885 destIndex>0 &&
1886 composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
1887 srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
1888 destIndex<=destLimit ? destIndex-1: 0,
1889 nx)
1890 ) {
1891 srcIndex=ioIndex[0];
1892 prevStarter=srcIndex;
1893 continue;
1894 }
1895
1896 srcIndex = ioIndex[0];
1897
1898 /* the Jamo V/T did not compose into a Hangul syllable, just
1899 * append to dest */
1900 c2=0;
1901 length=1;
1902 prevStarter=prevSrc;
1903 } else {
1904 if(isNorm32Regular(norm32)) {
1905 c2=0;
1906 length=1;
1907 } else {
1908 /* c is a lead surrogate, get the real norm32 */
1909 if(srcIndex!=srcLimit &&
1910 UTF16.isTrailSurrogate(c2=src[srcIndex])) {
1911 ++srcIndex;
1912 length=2;
1913 norm32=getNorm32FromSurrogatePair(norm32, c2);
1914 } else {
1915 /* c is an unpaired lead surrogate, nothing to do */
1916 c2=0;
1917 length=1;
1918 norm32=0;
1919 }
1920 }
1921 ComposePartArgs args =new ComposePartArgs();
1922
1923 /* we are looking at the character (c, c2) at [prevSrc..src[ */
1924 if(nx_contains(nx, c, c2)) {
1925 /* excluded: norm32==0 */
1926 cc=0;
1927 } else if((norm32&qcMask)==0) {
1928 cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
1929 } else {
1930 char[] p;
1931
1932 /*
1933 * find appropriate boundaries around this character,
1934 * decompose the source text from between the boundaries,
1935 * and recompose it
1936 *
1937 * this puts the intermediate text into the side buffer because
1938 * it might be longer than the recomposition end result,
1939 * or the destination buffer may be too short or missing
1940 *
1941 * note that destIndex may be adjusted backwards to account
1942 * for source text that passed the quick check but needed to
1943 * take part in the recomposition
1944 */
1945 int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
1946 /*
1947 * find the last true starter in [prevStarter..src[
1948 * it is either the decomposition of the current character (at prevSrc),
1949 * or prevStarter
1950 */
1951 if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
1952 prevStarter=prevSrc;
1953 } else {
1954 /* adjust destIndex: back out what had been copied with qc "yes" */
1955 destIndex-=prevSrc-prevStarter;
1956 }
1957
1958 /* find the next true starter in [src..limit[ */
1959 srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
1960 decompQCMask, minNoMaybe);
1961 //args.prevStarter = prevStarter;
1962 args.prevCC = prevCC;
1963 //args.destIndex = destIndex;
1964 args.length = length;
1965 p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
1966
1967 if(p==null) {
1968 /* an error occurred (out of memory) */
1969 break;
1970 }
1971
1972 prevCC = args.prevCC;
1973 length = args.length;
1974
1975 /* append the recomposed buffer contents to the destination
1976 * buffer */
1977 if((destIndex+args.length)<=destLimit) {
1978 int i=0;
1979 while(i<args.length) {
1980 dest[destIndex++]=p[i++];
1981 --length;
1982 }
1983 } else {
1984 /* buffer overflow */
1985 /* keep incrementing the destIndex for preflighting */
1986 destIndex+=length;
1987 }
1988
1989 prevStarter=srcIndex;
1990 continue;
1991 }
1992 }
1993
1994 /* append the single code point (c, c2) to the destination buffer */
1995 if((destIndex+length)<=destLimit) {
1996 if(cc!=0 && cc<prevCC) {
1997 /* (c, c2) is out of order with respect to the preceding
1998 * text */
1999 int reorderSplit= destIndex;
2000 destIndex+=length;
2001 prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
2002 destIndex, c, c2, cc);
2003 } else {
2004 /* just append (c, c2) */
2005 dest[destIndex++]=c;
2006 if(c2!=0) {
2007 dest[destIndex++]=c2;
2008 }
2009 prevCC=cc;
2010 }
2011 } else {
2012 /* buffer overflow */
2013 /* keep incrementing the destIndex for preflighting */
2014 destIndex+=length;
2015 prevCC=cc;
2016 }
2017 }
2018
2019 return destIndex - destStart;
2020 }
2021
2022 public static int getCombiningClass(int c) {
2023 long norm32;
2024 norm32=getNorm32(c);
2025 return (char)((norm32>>CC_SHIFT)&0xFF);
2026 }
2027
2028 public static boolean isFullCompositionExclusion(int c) {
2029 if(isFormatVersion_2_1) {
2030 int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
2031 return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
2032 } else {
2033 return false;
2034 }
2035 }
2036
2037 public static boolean isCanonSafeStart(int c) {
2038 if(isFormatVersion_2_1) {
2039 int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2040 return (boolean)((aux & AUX_UNSAFE_MASK)==0);
2041 } else {
2042 return false;
2043 }
2044 }
2045
2046 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
2047 public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) {
2048 long /*unsigned int*/ norm32;
2049 mask = mask & UNSIGNED_INT_MASK;
2050 char aux;
2051
2052 /* check conditions (a)..(e), see unormimp.h */
2053 norm32 = getNorm32(c);
2054
2055 if((norm32&mask)!=0) {
2056 return false; /* fails (a)..(e), not skippable */
2057 }
2058
2059 if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
2060 return true; /* NF*D, passed (a)..(c), is skippable */
2061 }
2062 /* check conditions (a)..(e), see unormimp.h */
2063
2064 /* NF*C/FCC, passed (a)..(e) */
2065 if((norm32& QC_NFD)==0) {
2066 return true; /* no canonical decomposition, is skippable */
2067 }
2068
2069 /* check Hangul syllables algorithmically */
2070 if(isNorm32HangulOrJamo(norm32)) {
2071 /* Jamo passed (a)..(e) above, must be Hangul */
2072 return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
2073 }
2074
2075 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
2076 /* NF*C, test (f) flag */
2077 if(!isFormatVersion_2_2) {
2078 return false; /* no (f) data, say not skippable to be safe */
2079 }
2080
2081
2082 aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
2083 return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
2084
2085 /* } else { FCC, test fcd<=1 instead of the above } */
2086 }
2087
2088 public static UnicodeSet addPropertyStarts(UnicodeSet set) {
2089 int c;
2090
2091 /* add the start code point of each same-value range of each trie */
2092 //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
2093 TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
2094 RangeValueIterator.Element normResult = new RangeValueIterator.Element();
2095
2096 while(normIter.next(normResult)){
2097 set.add(normResult.start);
2098 }
2099
2100 //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
2101 TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
2102 RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
2103
2104 while(fcdIter.next(fcdResult)){
2105 set.add(fcdResult.start);
2106 }
2107
2108 if(isFormatVersion_2_1){
2109 //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
2110 TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
2111 RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
2112 while(auxIter.next(auxResult)){
2113 set.add(auxResult.start);
2114 }
2115 }
2116 /* add Hangul LV syllables and LV+1 because of skippables */
2117 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
2118 set.add(c);
2119 set.add(c+1);
2120 }
2121 set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
2122 return set; // for chaining
2123 }
2124
2125 /**
2126 * Internal API, used in UCharacter.getIntPropertyValue().
2127 * @internal
2128 * @param c code point
2129 * @param modeValue numeric value compatible with Mode
2130 * @return numeric value compatible with QuickCheck
2131 */
2132 public static final int quickCheck(int c, int modeValue) {
2133 final int qcMask[/*UNORM_MODE_COUNT*/]={
2134 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
2135 };
2136
2137 int norm32=(int)getNorm32(c)&qcMask[modeValue];
2138
2139 if(norm32==0) {
2140 return 1; // YES
2141 } else if((norm32&QC_ANY_NO)!=0) {
2142 return 0; // NO
2143 } else /* _NORM_QC_ANY_MAYBE */ {
2144 return 2; // MAYBE;
2145 }
2146 }
2147
2148 private static int strCompare(char[] s1, int s1Start, int s1Limit,
2149 char[] s2, int s2Start, int s2Limit,
2150 boolean codePointOrder) {
2151
2152 int start1, start2, limit1, limit2;
2153
2154 char c1, c2;
2155
2156 /* setup for fix-up */
2157 start1=s1Start;
2158 start2=s2Start;
2159
2160 int length1, length2;
2161
2162 length1 = s1Limit - s1Start;
2163 length2 = s2Limit - s2Start;
2164
2165 int lengthResult;
2166
2167 if(length1<length2) {
2168 lengthResult=-1;
2169 limit1=start1+length1;
2170 } else if(length1==length2) {
2171 lengthResult=0;
2172 limit1=start1+length1;
2173 } else /* length1>length2 */ {
2174 lengthResult=1;
2175 limit1=start1+length2;
2176 }
2177
2178 if(s1==s2) {
2179 return lengthResult;
2180 }
2181
2182 for(;;) {
2183 /* check pseudo-limit */
2184 if(s1Start==limit1) {
2185 return lengthResult;
2186 }
2187
2188 c1=s1[s1Start];
2189 c2=s2[s2Start];
2190 if(c1!=c2) {
2191 break;
2192 }
2193 ++s1Start;
2194 ++s2Start;
2195 }
2196
2197 /* setup for fix-up */
2198 limit1=start1+length1;
2199 limit2=start2+length2;
2200
2201
2202 /* if both values are in or above the surrogate range, fix them up */
2203 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
2204 /* subtract 0x2800 from BMP code points to make them smaller than
2205 * supplementary ones */
2206 if(
2207 ( c1<=0xdbff && (s1Start+1)!=limit1 &&
2208 UTF16.isTrailSurrogate(s1[(s1Start+1)])
2209 ) ||
2210 ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
2211 UTF16.isLeadSurrogate(s1[(s1Start-1)])
2212 )
2213 ) {
2214 /* part of a surrogate pair, leave >=d800 */
2215 } else {
2216 /* BMP code point - may be surrogate code point - make <d800 */
2217 c1-=0x2800;
2218 }
2219
2220 if(
2221 ( c2<=0xdbff && (s2Start+1)!=limit2 &&
2222 UTF16.isTrailSurrogate(s2[(s2Start+1)])
2223 ) ||
2224 ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
2225 UTF16.isLeadSurrogate(s2[(s2Start-1)])
2226 )
2227 ) {
2228 /* part of a surrogate pair, leave >=d800 */
2229 } else {
2230 /* BMP code point - may be surrogate code point - make <d800 */
2231 c2-=0x2800;
2232 }
2233 }
2234
2235 /* now c1 and c2 are in UTF-32-compatible order */
2236 return (int)c1-(int)c2;
2237 }
2238
2239
2240 /*
2241 * Status of tailored normalization
2242 *
2243 * This was done initially for investigation on Unicode public review issue 7
2244 * (http://www.unicode.org/review/). See Jitterbug 2481.
2245 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
2246 * a permanent feature in ICU 2.6 in support of IDNA which requires true
2247 * Unicode 3.2 normalization.
2248 * (NormalizationCorrections are rolled into IDNA mapping tables.)
2249 *
2250 * Tailored normalization as implemented here allows to "normalize less"
2251 * than full Unicode normalization would.
2252 * Based internally on a UnicodeSet of code points that are
2253 * "excluded from normalization", the normalization functions leave those
2254 * code points alone ("inert"). This means that tailored normalization
2255 * still transforms text into a canonically equivalent form.
2256 * It does not add decompositions to code points that do not have any or
2257 * change decomposition results.
2258 *
2259 * Any function that searches for a safe boundary has not been touched,
2260 * which means that these functions will be over-pessimistic when
2261 * exclusions are applied.
2262 * This should not matter because subsequent checks and normalizations
2263 * do apply the exclusions; only a little more of the text may be processed
2264 * than necessary under exclusions.
2265 *
2266 * Normalization exclusions have the following effect on excluded code points c:
2267 * - c is not decomposed
2268 * - c is not a composition target
2269 * - c does not combine forward or backward for composition
2270 * except that this is not implemented for Jamo
2271 * - c is treated as having a combining class of 0
2272 */
2273
2274 /*
2275 * Constants for the bit fields in the options bit set parameter.
2276 * These need not be public.
2277 * A user only needs to know the currently assigned values.
2278 * The number and positions of reserved bits per field can remain private.
2279 */
2280 private static final int OPTIONS_NX_MASK=0x1f;
2281 private static final int OPTIONS_UNICODE_MASK=0xe0;
2282 public static final int OPTIONS_SETS_MASK=0xff;
2283 // private static final int OPTIONS_UNICODE_SHIFT=5;
2284 private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
2285
2286 /* Constants for options flags for normalization.*/
2287
2288 /**
2289 * Options bit 0, do not decompose Hangul syllables.
2290 * @draft ICU 2.6
2291 */
2292 private static final int NX_HANGUL = 1;
2293 /**
2294 * Options bit 1, do not decompose CJK compatibility characters.
2295 * @draft ICU 2.6
2296 */
2297 private static final int NX_CJK_COMPAT=2;
2298 /**
2299 * Options bit 8, use buggy recomposition described in
2300 * Unicode Public Review Issue #29
2301 * at http://www.unicode.org/review/resolved-pri.html#pri29
2302 *
2303 * Used in IDNA implementation according to strict interpretation
2304 * of IDNA definition based on Unicode 3.2 which predates PRI #29.
2305 *
2306 * See ICU4C unormimp.h
2307 *
2308 * @draft ICU 3.2
2309 */
2310 public static final int BEFORE_PRI_29=0x100;
2311
2312 /*
2313 * The following options are used only in some composition functions.
2314 * They use bits 12 and up to preserve lower bits for the available options
2315 * space in unorm_compare() -
2316 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
2317 */
2318
2319 /** Options bit 12, for compatibility vs. canonical decomposition. */
2320 public static final int OPTIONS_COMPAT=0x1000;
2321 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
2322 public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
2323
2324 /* normalization exclusion sets --------------------------------------------- */
2325
2326 /*
2327 * Normalization exclusion UnicodeSets are used for tailored normalization;
2328 * see the comment near the beginning of this file.
2329 *
2330 * By specifying one or several sets of code points,
2331 * those code points become inert for normalization.
2332 */
2333 private static final synchronized UnicodeSet internalGetNXHangul() {
2334 /* internal function, does not check for incoming U_FAILURE */
2335
2336 if(nxCache[NX_HANGUL]==null) {
2337 nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
2338 }
2339 return nxCache[NX_HANGUL];
2340 }
2341
2342 private static final synchronized UnicodeSet internalGetNXCJKCompat() {
2343 /* internal function, does not check for incoming U_FAILURE */
2344
2345 if(nxCache[NX_CJK_COMPAT]==null) {
2346
2347 /* build a set from [CJK Ideographs]&[has canonical decomposition] */
2348 UnicodeSet set, hasDecomp;
2349
2350 set=new UnicodeSet("[:Ideographic:]");
2351
2352 /* start with an empty set for [has canonical decomposition] */
2353 hasDecomp=new UnicodeSet();
2354
2355 /* iterate over all ideographs and remember which canonically decompose */
2356 UnicodeSetIterator it = new UnicodeSetIterator(set);
2357 int start, end;
2358 long norm32;
2359
2360 while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
2361 start=it.codepoint;
2362 end=it.codepointEnd;
2363 while(start<=end) {
2364 norm32 = getNorm32(start);
2365 if((norm32 & QC_NFD)>0) {
2366 hasDecomp.add(start);
2367 }
2368 ++start;
2369 }
2370 }
2371
2372 /* hasDecomp now contains all ideographs that decompose canonically */
2373 nxCache[NX_CJK_COMPAT]=hasDecomp;
2374
2375 }
2376
2377 return nxCache[NX_CJK_COMPAT];
2378 }
2379
2380 private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
2381 options &= OPTIONS_UNICODE_MASK;
2382 if(options==0) {
2383 return null;
2384 }
2385
2386 if(nxCache[options]==null) {
2387 /* build a set with all code points that were not designated by the specified Unicode version */
2388 UnicodeSet set = new UnicodeSet();
2389
2390 switch(options) {
2391 case NormalizerBase.UNICODE_3_2:
2392 set.applyPattern("[:^Age=3.2:]");
2393 break;
2394 default:
2395 return null;
2396 }
2397
2398 nxCache[options]=set;
2399 }
2400
2401 return nxCache[options];
2402 }
2403
2404 /* Get a decomposition exclusion set. The data must be loaded. */
2405 private static final synchronized UnicodeSet internalGetNX(int options) {
2406 options&=OPTIONS_SETS_MASK;
2407
2408 if(nxCache[options]==null) {
2409 /* return basic sets */
2410 if(options==NX_HANGUL) {
2411 return internalGetNXHangul();
2412 }
2413 if(options==NX_CJK_COMPAT) {
2414 return internalGetNXCJKCompat();
2415 }
2416 if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
2417 return internalGetNXUnicode(options);
2418 }
2419
2420 /* build a set from multiple subsets */
2421 UnicodeSet set;
2422 UnicodeSet other;
2423
2424 set=new UnicodeSet();
2425
2426
2427 if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
2428 set.addAll(other);
2429 }
2430 if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
2431 set.addAll(other);
2432 }
2433 if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
2434 set.addAll(other);
2435 }
2436
2437 nxCache[options]=set;
2438 }
2439 return nxCache[options];
2440 }
2441
2442 public static final UnicodeSet getNX(int options) {
2443 if((options&=OPTIONS_SETS_MASK)==0) {
2444 /* incoming failure, or no decomposition exclusions requested */
2445 return null;
2446 } else {
2447 return internalGetNX(options);
2448 }
2449 }
2450
2451 private static final boolean nx_contains(UnicodeSet nx, int c) {
2452 return nx!=null && nx.contains(c);
2453 }
2454
2455 private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
2456 return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
2457 }
2458
2459 /*****************************************************************************/
2460
2461 /**
2462 * Get the canonical decomposition
2463 * sherman for ComposedCharIter
2464 */
2465
2466 public static int getDecompose(int chars[], String decomps[]) {
2467 DecomposeArgs args = new DecomposeArgs();
2468 int length=0;
2469 long norm32 = 0;
2470 int ch = -1;
2471 int index = 0;
2472 int i = 0;
2473
2474 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
2475 //TBD !!!! the hack code heres save us about 50ms for startup
2476 //need a better solution/lookup
2477 if (ch == 0x30ff)
2478 ch = 0xf900;
2479 else if (ch == 0x10000)
2480 ch = 0x1d15e;
2481 else if (ch == 0x1d1c1)
2482 ch = 0x2f800;
2483
2484 norm32 = NormalizerImpl.getNorm32(ch);
2485 if((norm32 & QC_NFD)!=0 && i < chars.length) {
2486 chars[i] = ch;
2487 index = decompose(norm32, args);
2488 decomps[i++] = new String(extraData,index, args.length);
2489 }
2490 }
2491 return i;
2492 }
2493
2494 //------------------------------------------------------
2495 // special method for Collation
2496 //------------------------------------------------------
2497 private static boolean needSingleQuotation(char c) {
2498 return (c >= 0x0009 && c <= 0x000D) ||
2499 (c >= 0x0020 && c <= 0x002F) ||
2500 (c >= 0x003A && c <= 0x0040) ||
2501 (c >= 0x005B && c <= 0x0060) ||
2502 (c >= 0x007B && c <= 0x007E);
2503 }
2504
2505 public static String canonicalDecomposeWithSingleQuotation(String string) {
2506 char[] src = string.toCharArray();
2507 int srcIndex = 0;
2508 int srcLimit = src.length;
2509 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
2510 int destIndex = 0;
2511 int destLimit = dest.length;
2512
2513 char[] buffer = new char[3];
2514 int prevSrc;
2515 long norm32;
2516 int ccOrQCMask;
2517 int qcMask = QC_NFD;
2518 int reorderStartIndex, length;
2519 char c, c2;
2520 char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE];
2521 int cc, prevCC, trailCC;
2522 char[] p;
2523 int pStart;
2524
2525
2526 // initialize
2527 ccOrQCMask = CC_MASK | qcMask;
2528 reorderStartIndex = 0;
2529 prevCC = 0;
2530 norm32 = 0;
2531 c = 0;
2532 pStart = 0;
2533
2534 cc = trailCC = -1; // initialize to bogus value
2535 for(;;) {
2536 prevSrc=srcIndex;
2537 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
2538 while (srcIndex != srcLimit &&
2539 (( c = src[srcIndex]) < minNoMaybe ||
2540 ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 ||
2541 ( c >= '\uac00' && c <= '\ud7a3'))){
2542
2543 prevCC = 0;
2544 ++srcIndex;
2545 }
2546
2547 // copy these code units all at once
2548 if (srcIndex != prevSrc) {
2549 length = (int)(srcIndex - prevSrc);
2550 if ((destIndex + length) <= destLimit) {
2551 System.arraycopy(src,prevSrc,dest,destIndex,length);
2552 }
2553
2554 destIndex += length;
2555 reorderStartIndex = destIndex;
2556 }
2557
2558 // end of source reached?
2559 if(srcIndex == srcLimit) {
2560 break;
2561 }
2562 // c already contains *src and norm32 is set for it, increment src
2563 ++srcIndex;
2564
2565 if(isNorm32Regular(norm32)) {
2566 c2 = 0;
2567 length = 1;
2568 } else {
2569 // c is a lead surrogate, get the real norm32
2570 if(srcIndex != srcLimit &&
2571 Character.isLowSurrogate(c2 = src[srcIndex])) {
2572 ++srcIndex;
2573 length = 2;
2574 norm32 = getNorm32FromSurrogatePair(norm32, c2);
2575 } else {
2576 c2 = 0;
2577 length = 1;
2578 norm32 = 0;
2579 }
2580 }
2581
2582 // get the decomposition and the lead and trail cc's
2583 if((norm32 & qcMask) == 0) {
2584 // c does not decompose
2585 cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
2586 p = null;
2587 pStart = -1;
2588 } else {
2589 DecomposeArgs arg = new DecomposeArgs();
2590 // c decomposes, get everything from the variable-length
2591 // extra data
2592 pStart = decompose(norm32, qcMask, arg);
2593 p = extraData;
2594 length = arg.length;
2595 cc = arg.cc;
2596 trailCC = arg.trailCC;
2597 if(length == 1) {
2598 // fastpath a single code unit from decomposition
2599 c = p[pStart];
2600 c2 = 0;
2601 p = null;
2602 pStart = -1;
2603 }
2604 }
2605
2606 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations
2607 // buffer overflow
2608 char[] tmpBuf = new char[destLimit * 2];
2609 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
2610 dest = tmpBuf;
2611 destLimit = dest.length;
2612 }
2613 // append the decomposition to the destination buffer, assume length>0
2614 {
2615 int reorderSplit = destIndex;
2616 if(p == null) {
2617 // fastpath: single code point
2618 if (needSingleQuotation(c)) {
2619 //if we need single quotation, no need to consider "prevCC"
2620 //and it must NOT be a supplementary pair
2621 dest[destIndex++] = '\'';
2622 dest[destIndex++] = c;
2623 dest[destIndex++] = '\'';
2624 trailCC = 0;
2625 } else if(cc != 0 && cc < prevCC) {
2626 // (c, c2) is out of order with respect to the preceding
2627 // text
2628 destIndex += length;
2629 trailCC = insertOrdered(dest,reorderStartIndex,
2630 reorderSplit, destIndex, c, c2, cc);
2631 } else {
2632 // just append (c, c2)
2633 dest[destIndex++] = c;
2634 if(c2 != 0) {
2635 dest[destIndex++] = c2;
2636 }
2637 }
2638 } else {
2639 // general: multiple code points (ordered by themselves)
2640 // from decomposition
2641 if (needSingleQuotation(p[pStart])) {
2642 dest[destIndex++] = '\'';
2643 dest[destIndex++] = p[pStart++];
2644 dest[destIndex++] = '\'';
2645 length--;
2646 do {
2647 dest[destIndex++] = p[pStart++];
2648 } while(--length > 0);
2649 } else
2650 if(cc != 0 && cc < prevCC) {
2651 destIndex += length;
2652 trailCC = mergeOrdered(dest,reorderStartIndex,
2653 reorderSplit,p, pStart,pStart+length);
2654 } else {
2655 // just append the decomposition
2656 do {
2657 dest[destIndex++] = p[pStart++];
2658 } while(--length > 0);
2659 }
2660 }
2661 }
2662 prevCC = trailCC;
2663 if(prevCC == 0) {
2664 reorderStartIndex = destIndex;
2665 }
2666 }
2667 return new String(dest, 0, destIndex);
2668 }
2669
2670 //------------------------------------------------------
2671 // mapping method for IDNA/StringPrep
2672 //------------------------------------------------------
2673
2674 /*
2675 * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
2676 * 3.2 normalization with Corrigendum 4 corrections. However, normalization
2677 * without the corrections is necessary for IDNA/StringPrep support.
2678 * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
2679 * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
2680 * characters in Corrigendum 4 before normalization in order to avoid
2681 * incorrect normalization.
2682 * For the Corrigendum 4 issue, refer
2683 * http://www.unicode.org/versions/corrigendum4.html
2684 */
2685
2686 /*
2687 * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
2688 */
2689 public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;
2690
2691 private static final char[][] corrigendum4MappingTable = {
2692 {'\uD844', '\uDF6A'}, // 0x2F868
2693 {'\u5F33'}, // 0x2F874
2694 {'\u43AB'}, // 0x2F91F
2695 {'\u7AAE'}, // 0x2F95F
2696 {'\u4D57'}}; // 0x2F9BF
2697
2698 /*
2699 * Removing Corrigendum 4 fix
2700 * @return normalized text
2701 */
2702 public static String convert(String str) {
2703 if (str == null) {
2704 return null;
2705 }
2706
2707 int ch = UCharacterIterator.DONE;
2708 StringBuffer dest = new StringBuffer();
2709 UCharacterIterator iter = UCharacterIterator.getInstance(str);
2710
2711 while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
2712 switch (ch) {
2713 case 0x2F868:
2714 dest.append(corrigendum4MappingTable[0]);
2715 break;
2716 case 0x2F874:
2717 dest.append(corrigendum4MappingTable[1]);
2718 break;
2719 case 0x2F91F:
2720 dest.append(corrigendum4MappingTable[2]);
2721 break;
2722 case 0x2F95F:
2723 dest.append(corrigendum4MappingTable[3]);
2724 break;
2725 case 0x2F9BF:
2726 dest.append(corrigendum4MappingTable[4]);
2727 break;
2728 default:
2729 UTF16.append(dest,ch);
2730 break;
2731 }
2732 }
2733
2734 return dest.toString();
2735 }
2736 }